ERCOT-Full Dataset¶
In [1]:
#importing necessary libraries
# -- plotting --
import seaborn as sns
color_pal = sns.color_palette()
from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')
# -- data handling --
import pandas as pd
import numpy as np
# NOTE: the original imported datetime three ways; the later `import datetime`
# shadowed the `datetime` class imported first. Only `timedelta` is actually
# used in this notebook, so a single import line suffices.
from datetime import datetime, timedelta
# -- modeling / evaluation --
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import scipy.stats as stats
# -- interactive plotting --
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.graph_objects as go
1. Loading/Cleaning Weather Data¶
- (1.1) reading weather file from: Jan 2015- Jan 2017
- (1.2) adding in data from: Jan 2017- Jan 2023
- (1.3) adding in data from: Jan 2023-Present
- (1.4) cleaning up weather data: removing null values, converting date and setting to index in DateTime format
In [2]:
#(1.1)-(1.3) reading the weather CSVs: Jan 2015 - present.
# The files cover two-year windows (2015-2017, 2017-2019, 2019-2021, 2021-2023)
# plus a partial 2023 file. Collect the frames in a list and concatenate once;
# concatenating inside the loop is quadratic in the number of files.
weather_frames = [pd.read_csv("./Weather_Data/2015-2017.csv")]
for year in range(2017, 2023, 2):
    weather_frames.append(pd.read_csv(f"./Weather_Data/{year}-{year + 2}.csv"))
weather_frames.append(pd.read_csv("./Weather_Data/2023.csv"))
weatherdf = pd.concat(weather_frames, axis=0)
#(1.4) cleaning up weather data: build a DatetimeIndex from the Date/Hour
# columns, then drop sentinel rows (Temp == -99 marks a missing observation).
weatherdf['Hour'] = weatherdf['Hour'].astype(str)
weatherdf["Time"] = pd.to_datetime(weatherdf["Date"] + " " + weatherdf["Hour"] + ":00")
weatherdf = weatherdf.set_index('Time').drop(columns=["Date", "Hour"])
weatherdf = weatherdf[weatherdf["Temp"] != -99]
#displaying summary of weather data
weatherdf.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 74556 entries, 2015-01-01 00:00:00 to 2023-06-30 23:00:00 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Temp 74556 non-null int64 1 Dew Point 74556 non-null int64 2 Heat Index 74556 non-null int64 3 Wind Speed (mph) 74556 non-null int64 4 RH 74556 non-null int64 dtypes: int64(5) memory usage: 3.4 MB
2. Loading/Cleaning Load Data¶
- (2.1) reading native load file from: Jan 2015- Jan 2016
- (2.2) adding in data from: Jan 2016- Present https://www.ercot.com/gridinfo/load/load_hist
- (2.3) first load data clean-up: removing null values, rounding values
- (2.4) handling daylight savings times (delete manually in files)
- (2.5) remove other errors in time such as leading zeros/other inconsistencies, rounding hours
In [3]:
#(2.1)+(2.2) reading the native load files: Jan 2015 - present
# (https://www.ercot.com/gridinfo/load/load_hist). Read all yearly files into a
# list and concatenate once instead of concatenating inside the loop.
load_frames = [pd.read_csv(f"./loads/Native_Load_{year}.csv") for year in range(2015, 2024)]
loadf = pd.concat(load_frames, axis=0)
#displaying summary of the raw load data (before cleaning)
loadf.info()
#(2.3) cleaning up load data: removing null values, stripping thousands
# separators, coercing to numeric and rounding
loadf = loadf.reset_index(drop=True)
loadf = loadf.dropna()
loadf = loadf.replace(',', '', regex=True)
# every column except the first ("Hour Ending") is a numeric load column
for column in loadf.columns[1:]:
    loadf[column] = pd.to_numeric(loadf[column], errors='coerce').round(2)
#(2.4) handling daylight savings times (delete manually in files)
#loadf.index[loadf['Hour Ending'] == '11/5/2017 02:00 DST']
#Ex: 11/01/2020 02:00 DST, 11/07/2021 02:00 DST, 11/06/2022 02:00 DST
#(2.5) remove other errors in time such as leading zeros/other inconsistencies, rounding hours
listOfHours = loadf["Hour Ending"].tolist()
#going through each timestamp individually and checking for/fixing bugs
# (loop index renamed from `list`, which shadowed the builtin)
for i in range(len(listOfHours)):
    # midnight error in 2023 file: a truncated entry with no time part --
    # derive the timestamp from the previous row instead
    if len(listOfHours[i]) < 6:
        date2 = pd.to_datetime(listOfHours[i - 1], format='%m/%d/%y %H:%M') + timedelta(minutes=59)
        listOfHours[i] = date2.strftime('%m/%d/%y %H:%M')
    # leading zero error for month
    if listOfHours[i][0] == '0':
        listOfHours[i] = listOfHours[i][1:]
    # leading zero error for date
    if '/0' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace("/0", "/")
    # abbreviating four-digit years
    if '/2021' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace("/2021", "/21")
    if '/2022' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace("/2022", "/22")
    if '/2023' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace("/2023", "/23")
    # changing midnight to be consistent (0:00 not 24:00)
    if '24:00' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace(" 24:00", " 0:00")
    # any other leading zeros on the hour
    # NOTE(review): with this nesting the replace only runs when ' 0' is
    # absent, making it a no-op — reproduced as in the original; confirm intent.
    if not ' 0:' in listOfHours[i]:
        if not ' 0' in listOfHours[i]:
            listOfHours[i] = listOfHours[i].replace(" 0", " ")
#updating times and setting to index in DateTime format
loadf["Hour Ending"] = listOfHours
loadf = loadf.reset_index(drop=True)
loadf["Hour Ending"] = loadf["Hour Ending"].apply(lambda x: pd.to_datetime(x, format='%m/%d/%y %H:%M').round('h'))
loadf = loadf.set_index("Hour Ending").rename_axis('Time', axis='index')
<class 'pandas.core.frame.DataFrame'> Int64Index: 74464 entries, 0 to 4341 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Hour Ending 74464 non-null object 1 COAST 74463 non-null object 2 EAST 74463 non-null object 3 FWEST 74463 non-null object 4 NORTH 74463 non-null object 5 NCENT 74463 non-null object 6 SOUTH 74463 non-null object 7 SCENT 74463 non-null object 8 WEST 74463 non-null object 9 ERCOT 74463 non-null object dtypes: object(10) memory usage: 6.2+ MB
3. Merging and Saving Datasets¶
- (3.1) merging data and saving to "ERCOT-Full_Data.csv"
In [4]:
#(3.1) merging weather and load data and saving to "ERCOT-Full_Data.csv"
# (inner join on the shared DatetimeIndex keeps only hours present in both)
finaldata = pd.merge(weatherdf, loadf["ERCOT"], left_index=True, right_index=True)
#dropping null values
finaldata = finaldata.dropna()
#naming the index — rename_axis is not in-place; the original discarded this result
finaldata = finaldata.rename_axis('Time', axis='index')
finaldata.to_csv('ERCOT-Full_Data.csv')
4. Setting up the Model¶
- (4.1) re-reading data
- (4.2) defining characteristics
- (4.3) adding time lags characteristic
In [5]:
#(4.1) re-reading the merged dataset and restoring the DatetimeIndex
df = pd.read_csv("./ERCOT-Full_Data.csv")
df["Time"] = pd.to_datetime(df["Time"])
df = df.set_index("Time")
# rename_axis is not in-place; the original discarded this result, so the
# index silently kept the name 'Time'. Assign it to actually apply the rename.
df = df.rename_axis('DATE', axis='index')
#(4.2) defining characteristics
def create_features(df):
"""
Create time series features based on time series index.
"""
df = df.copy()
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['quarter'] = df.index.quarter
df['month'] = df.index.month
df['year'] = df.index.year
df['dayofyear'] = df.index.dayofyear
df['dayofmonth'] = df.index.day
df['weekofyear'] = df.index.isocalendar().week
return df
# apply the calendar features to the full dataset
df = create_features(df)
#(4.3) adding time lags characteristic
def add_lags(df):
    """
    Add lagged-target columns: 'lag1'/'lag2'/'lag3' hold the 'ERCOT' value
    from 364, 728 and 1092 days earlier (whole-week multiples, so each lag
    lands on the same weekday). Timestamps with no match that far back get
    NaN. Mutates ``df`` in place and returns it.
    """
    target_map = df['ERCOT'].to_dict()
    for col, days in (('lag1', 364), ('lag2', 728), ('lag3', 1092)):
        df[col] = (df.index - pd.Timedelta(days=days)).map(target_map)
    return df
# attach the lagged ERCOT targets to the full dataset
df = add_lags(df)
Plots¶
In [6]:
#displaying correlations with a heatmap
# pairwise correlation matrix of the feature columns (pandas default: Pearson)
sns.heatmap(df.corr())
Out[6]:
<AxesSubplot:>
In [7]:
#plotting MW by hour
fig, ax = plt.subplots(figsize=(10, 4))
# draw on the axes we just created instead of relying on the implicit
# pyplot "current axes" state
sns.boxplot(data=df, x='hour', y='ERCOT', palette='rocket', ax=ax)
ax.set_title('MW by Hour')
plt.show()
In [8]:
#plotting MW by month
fig, ax = plt.subplots(figsize=(10, 4))
# pass the target axes explicitly rather than using the implicit current axes
sns.boxplot(data=df, x='month', y='ERCOT', palette='crest', ax=ax)
ax.set_title('MW by Month')
plt.show()
5. Running the Model¶
- (5.1) splitting data to train and test
- (5.2) splitting to train and test, adding time/refining characteristics
- (5.3) defining the XGBoost regressor imported from the library
- (5.4) fitting the regression
- (5.5) interpreting and saving output from each fold
In [9]:
#(5.1) walk-forward cross-validation: 5 folds, each tested on one year of
# hourly data (24*365), with a 24-hour gap between train and test to avoid
# leakage from the lag features
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()
# FEATURES/TARGET are loop-invariant — define them once, outside the folds
# (the original re-created them on every iteration; it also kept an unused
# `fold = 0` counter, removed here)
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
            'Temp', 'Dew Point', 'Heat Index', 'Wind Speed (mph)', 'RH',
            'lag1', 'lag2', 'lag3']
TARGET = 'ERCOT'
preds = []
tests = []
scores = []
dates = []
#looping through each split
for train_idx, val_idx in tss.split(df):
    #(5.2) splitting to train and test, adding time/refining characteristics
    train = create_features(df.iloc[train_idx])
    test = create_features(df.iloc[val_idx])
    X_train = train[FEATURES]
    y_train = train[TARGET]
    X_test = test[FEATURES]
    y_test = test[TARGET]
    dates.append(X_test.index)
    #(5.3) defining the xgboost regressor imported from the library
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    #(5.4) fitting the regression (early stopping monitors the eval sets)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    #(5.5) interpreting and saving output from the fold (RMSE per fold)
    y_pred = reg.predict(X_test)
    tests.append(y_test)
    preds.append(y_pred)
    scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
[0] validation_0-rmse:40992.96724 validation_1-rmse:43167.60046 [100] validation_0-rmse:15403.39421 validation_1-rmse:17430.72870 [200] validation_0-rmse:6265.62196 validation_1-rmse:7792.90386 [300] validation_0-rmse:3256.90206 validation_1-rmse:4188.86569 [400] validation_0-rmse:2383.43343 validation_1-rmse:2777.52292 [500] validation_0-rmse:2099.05960 validation_1-rmse:2285.79443 [600] validation_0-rmse:1968.81538 validation_1-rmse:2099.68923 [700] validation_0-rmse:1892.74437 validation_1-rmse:2005.35838 [800] validation_0-rmse:1834.74048 validation_1-rmse:1945.35235 [900] validation_0-rmse:1787.39542 validation_1-rmse:1907.37107 [999] validation_0-rmse:1747.59078 validation_1-rmse:1882.96587 [0] validation_0-rmse:41483.80374 validation_1-rmse:44487.98210 [100] validation_0-rmse:15593.53524 validation_1-rmse:17661.01302 [200] validation_0-rmse:6337.64085 validation_1-rmse:7720.97180 [300] validation_0-rmse:3282.91072 validation_1-rmse:4164.97081 [400] validation_0-rmse:2409.40580 validation_1-rmse:2952.37406 [500] validation_0-rmse:2135.70714 validation_1-rmse:2549.31529 [600] validation_0-rmse:2008.15912 validation_1-rmse:2375.91870 [700] validation_0-rmse:1929.85562 validation_1-rmse:2284.38968 [800] validation_0-rmse:1872.13117 validation_1-rmse:2228.53187 [900] validation_0-rmse:1823.04984 validation_1-rmse:2191.59841 [999] validation_0-rmse:1782.85417 validation_1-rmse:2162.00461 [0] validation_0-rmse:42039.84476 validation_1-rmse:44765.59073 [100] validation_0-rmse:15806.75072 validation_1-rmse:17977.56916 [200] validation_0-rmse:6440.25841 validation_1-rmse:7996.06568 [300] validation_0-rmse:3369.16198 validation_1-rmse:4436.98635 [400] validation_0-rmse:2497.30242 validation_1-rmse:3292.11203 [500] validation_0-rmse:2226.43945 validation_1-rmse:2932.50650 [600] validation_0-rmse:2093.95779 validation_1-rmse:2790.43088 [700] validation_0-rmse:2011.74709 validation_1-rmse:2731.90832 [800] validation_0-rmse:1953.26799 validation_1-rmse:2699.67946 [900] 
validation_0-rmse:1901.43759 validation_1-rmse:2673.18317 [999] validation_0-rmse:1857.52879 validation_1-rmse:2658.05257 [0] validation_0-rmse:42467.65058 validation_1-rmse:47924.67780 [100] validation_0-rmse:15976.97048 validation_1-rmse:20291.80872 [200] validation_0-rmse:6512.80411 validation_1-rmse:10050.41793 [300] validation_0-rmse:3435.83783 validation_1-rmse:6244.60204 [400] validation_0-rmse:2566.78371 validation_1-rmse:4855.50602 [500] validation_0-rmse:2301.16963 validation_1-rmse:4189.71192 [600] validation_0-rmse:2182.17309 validation_1-rmse:3915.02671 [700] validation_0-rmse:2106.11622 validation_1-rmse:3766.12633 [800] validation_0-rmse:2048.87763 validation_1-rmse:3694.68887 [900] validation_0-rmse:1998.30372 validation_1-rmse:3601.00688 [999] validation_0-rmse:1956.07804 validation_1-rmse:3534.29333 [0] validation_0-rmse:43227.28666 validation_1-rmse:49375.15567 [100] validation_0-rmse:16296.42096 validation_1-rmse:20874.38370 [200] validation_0-rmse:6698.35776 validation_1-rmse:9857.74442 [300] validation_0-rmse:3581.44005 validation_1-rmse:5306.01925 [400] validation_0-rmse:2690.67192 validation_1-rmse:3589.58860 [500] validation_0-rmse:2411.77092 validation_1-rmse:3005.81901 [600] validation_0-rmse:2281.39950 validation_1-rmse:2756.06634 [700] validation_0-rmse:2191.46724 validation_1-rmse:2626.17951 [800] validation_0-rmse:2120.88125 validation_1-rmse:2539.85919 [900] validation_0-rmse:2064.50817 validation_1-rmse:2484.17576 [999] validation_0-rmse:2021.03117 validation_1-rmse:2452.50386
6. Analyzing Results¶
In [10]:
#making a csv tracking predictions for each hour alongside actual results
# concatenate all folds (works for any n_splits, not just the hard-coded 5)
analysis = pd.DataFrame({
    'Date': np.concatenate(dates),
    'Prediction': np.concatenate(preds),
    'Actual': np.concatenate([t.values for t in tests]),
})
# absolute percentage error per hour
analysis["% Error"] = ((analysis["Prediction"] - analysis["Actual"]) / analysis["Actual"]).abs() * 100
analysis.to_csv('ERCOT-Full_Results.csv')
print("% Error Statistics:")
#printing error metrics, box-plot
print(analysis["% Error"].describe())
sns.boxplot(x=analysis["% Error"])
#calculating and displaying rmse (mean of the per-fold RMSEs)
print(f'Model RMSE: {np.mean(scores):0.4f}')
% Error Statistics: count 43800.000000 mean 4.287299 std 3.508268 min 0.000130 25% 1.623331 50% 3.461501 75% 6.097114 max 41.689184 Name: % Error, dtype: float64 Model RMSE: 2537.8785
In [11]:
#displaying each factor's importance (taken from the last fold's fitted model)
importances = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=reg.feature_names_in_,
)
importances.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()
#scatter and line chart to trace the difference between predictions and actuals
first_1000 = analysis.head(1000)
scatter_fig = go.Figure()
scatter_fig.update_layout(title="Prediction vs Actual Scatter")
scatter_fig.add_trace(go.Scatter(x=first_1000.index, y=first_1000["Prediction"],
                                 mode='markers', name='Prediction'))
scatter_fig.add_trace(go.Scatter(x=first_1000.index, y=first_1000["Actual"],
                                 mode='lines', name='Actual'))
scatter_fig.show()
#histogram to trace the difference between predictions and actuals
hist_fig = go.Figure()
hist_fig.add_trace(go.Histogram(x=analysis["Prediction"], name='Prediction'))
hist_fig.add_trace(go.Histogram(x=analysis["Actual"], name='Actual'))
hist_fig.update_layout(title="Histogram of Model Results")
hist_fig.show()
7. Predicting the Future: July 2023¶
- (7.1) define future dataframe (set date)
- (7.2) adding future to full-dataset, making copy, adding all factors
- set weather features for the forecast (TODO: not yet implemented — future weather values remain NaN)
- (7.3) making predictions for the future, plotting and saving them
In [12]:
#(7.1) define the future dataframe: hourly timestamps covering July 2023
future = pd.date_range('2023-07-01', '23:00 2023-07-31', freq='1h')
future_df = pd.DataFrame(index=future)
#(7.2) append the future rows to the full dataset, then rebuild all features
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
future_w_features = df_and_future.query('isFuture').copy()
# FIXME(review): the weather columns (Temp, Dew Point, Heat Index,
# Wind Speed (mph), RH) are NaN for the future rows. The original cell only
# *accessed* them here (discarded expressions) without assigning forecast
# values, so predictions rely solely on calendar and lag features until a
# weather forecast is filled in at this point.
#(7.3) making predictions for the future, plotting and saving them
future_w_features['Prediction'] = reg.predict(future_w_features[FEATURES])
future_w_features['Prediction'].plot(figsize=(10, 5),
                                     color=color_pal[4],
                                     ms=1,
                                     lw=1,
                                     title='Future Predictions')
#saving to a Prediction csv
future_w_features['Prediction'].to_csv('ERCOT-Full_Predictions.csv')
#saving to a full csv combining past results with the future predictions
# (Actual and % Error are unknown for the future, so they are filled with 0)
source = pd.DataFrame(columns=['Prediction', 'Actual', '% Error'])
source["Prediction"] = future_w_features['Prediction']
source = source.reset_index()
source = source.fillna(0)
source.rename(columns={'index': 'Date'}, inplace=True)
pred_and_past = pd.concat([analysis, source], axis=0)
pred_and_past.to_csv('ERCOT-Full_Past-Predictions.csv')
#showing plot of predictions
plt.show()